# Notebook setup.
# root.dir is a knitr package option, so it is set through opts_knit.
knitr::opts_knit$set(root.dir = rprojroot::find_rstudio_root_file())

# fig.width / fig.height are chunk options; they only take effect when set
# through opts_chunk (passing them to opts_knit is silently ignored).
knitr::opts_chunk$set(
  fig.width = 14,
  fig.height = 14
)

# Fix the RNG seed so any randomised step is reproducible between runs.
set.seed(42)
library(tidyverse)
# Load the transcript/gene annotation table.
# Column types are pinned explicitly (same types readr previously guessed)
# so the result does not depend on type guessing from the first rows and the
# "Parsed with column specification" message is no longer emitted.
tx_gene_meta <- read_csv(
  "./data/mm_ens97_metadata.txt",
  col_types = cols(
    tx_id = col_character(),
    gene_name = col_character(),
    gene_len = col_double(),
    premrna_len = col_double(),
    intronic_len = col_double(),
    exonic_len = col_double(),
    cds_len = col_double(),
    utr5_len = col_double(),
    utr3_len = col_double(),
    nexon = col_double(),
    nintron = col_double(),
    perc_gene_gc = col_double(),
    gene_id = col_character(),
    seqnames = col_character(),
    gene_biotype = col_character(),
    tx_biotype = col_character(),
    tx_id_version = col_character()
  )
)
Parsed with column specification:
cols(
tx_id = [31mcol_character()[39m,
gene_name = [31mcol_character()[39m,
gene_len = [32mcol_double()[39m,
premrna_len = [32mcol_double()[39m,
intronic_len = [32mcol_double()[39m,
exonic_len = [32mcol_double()[39m,
cds_len = [32mcol_double()[39m,
utr5_len = [32mcol_double()[39m,
utr3_len = [32mcol_double()[39m,
nexon = [32mcol_double()[39m,
nintron = [32mcol_double()[39m,
perc_gene_gc = [32mcol_double()[39m,
gene_id = [31mcol_character()[39m,
seqnames = [31mcol_character()[39m,
gene_biotype = [31mcol_character()[39m,
tx_biotype = [31mcol_character()[39m,
tx_id_version = [31mcol_character()[39m
)
# Show the transcript-level table as loaded.
tx_gene_meta

# Gene-level table: keep the first row of each gene_id group and retain only
# the gene-level columns.
gene_meta <- tx_gene_meta %>%
  dplyr::group_by(gene_id) %>%
  dplyr::slice(1L) %>%
  dplyr::ungroup() %>%
  dplyr::select(
    gene_name,
    gene_len,
    perc_gene_gc,
    gene_id,
    seqnames,
    gene_biotype
  )
gene_meta
Some gene names are not unique: the same gene name can be attached to more than one gene ID.
# Gene names that occur on more than one row of gene_meta.
# duplicated() flags every occurrence after the first, so a name that is
# present three times appears twice in this vector.
duplicated_gene_names <- with(gene_meta, gene_name[duplicated(gene_name)])
duplicated_gene_names
[1] "C2cd6" "4930594M22Rik" "Gm16364" "Nron" "A530058N18Rik" "Snora43" "Dlx6os1" "Nkx2-2os" "1700030C10Rik" "Gm23925"
[11] "Pcdha11" "Gm2464" "1600017P15Rik" "Gm16701" "Ighv1-13" "Gm20690" "Gm7270" "Ighv5-8" "Gm4430" "Gm28724"
[21] "Gm36638" "Gm18433" "Gm23786" "Gm24022" "Mir1839" "Gm26379" "Mir677" "Gm24826" "Gm22149" "Rnu3b3"
[31] "Rnu3b4" "Gm23927" "Rnu3b1" "Gm25203" "Gm26457" "Rnu3b2" "Gm26413" "Gm23377" "Mir3068" "Gm25820"
[41] "Gm26265" "Mir1949" "Gm23604" "Snord80" "Gm24105" "Gm24350" "Gm25617" "Gm22711" "Gm22813" "Snora16a"
[51] "Gm23128" "Rnu3a" "Gm26047" "Rprl1" "Gm22897" "Gm27680" "Gm23370" "Gm25053" "Gm27013" "Zfp813-ps"
[61] "Gm16499" "Tmem147os" "Gm28710" "St6galnac2" "Gm18645" "Zkscan7" "Gm38642" "Grik2" "Olfr1073-ps1" "Scarna3b"
[71] "C730027H18Rik" "Gm6729" "Jakmip1" "Gm35558" "Gm27528" "Gm35558" "Gm9025" "Aldoa" "Dpep2" "Ndor1"
[81] "Gm5089" "Vmn2r-ps111" "Rmrp" "Gm6740" "Gm27825" "Mirt2" "Gm28040" "Sept2" "Vmn1r216" "Olfr290"
[91] "Gm38619" "Nnt" "Gm28023" "Gm41392" "Zc3h11a" "Gcat" "Ddit3" "Atp5o" "Gm5966" "Gm29719"
[101] "Ptp4a1" "Gm16386" "Gm27475" "Hdhd2" "Chtf8" "Ggnbp1" "Sssca1" "Arhgap26" "Arhgef4" "Fam220a"
[111] "Terc"
# Show every gene_meta row whose name is in the duplicated set, ordered by
# name so rows sharing a name sit next to each other.
gene_meta %>%
  dplyr::filter(is.element(gene_name, duplicated_gene_names)) %>%
  dplyr::arrange(gene_name)
# Number of genes per gene biotype, largest count first.
biotype_count <- dplyr::count(gene_meta, gene_biotype, sort = TRUE)
biotype_count

# The ten most frequent gene biotypes; later plots are restricted to these.
biotype_targets <- head(biotype_count$gene_biotype, 10)
# Histogram of gene GC percentage, one panel per selected gene biotype.
gene_meta %>%
  dplyr::filter(gene_biotype %in% biotype_targets) %>%
  ggplot(aes(perc_gene_gc)) +
  geom_histogram(bins = 150) +
  facet_wrap(~gene_biotype)

# Histogram of gene length on a log10 x axis, one panel per selected gene
# biotype.
gene_meta %>%
  dplyr::filter(gene_biotype %in% biotype_targets) %>%
  ggplot(aes(gene_len)) +
  geom_histogram(bins = 150) +
  scale_x_log10() +
  facet_wrap(~gene_biotype)
# Gene GC percentage (x) against gene length (y, log10) with 2D density
# contours drawn over the points, one panel per selected gene biotype.
gene_meta %>%
  dplyr::filter(gene_biotype %in% biotype_targets) %>%
  ggplot(aes(perc_gene_gc, gene_len)) +
  geom_point(size = 0.8, alpha = 0.7) +
  geom_density_2d() +
  scale_y_log10() +
  theme(legend.position = "none") +
  facet_wrap(~gene_biotype)
# Number of transcripts per transcript biotype, largest count first.
tx_biotype_count <- dplyr::count(tx_gene_meta, tx_biotype, sort = TRUE)
tx_biotype_count

# The eight most frequent transcript biotypes; later transcript-level plots
# are restricted to these.
tx_biotype_targets <- head(tx_biotype_count$tx_biotype, 8)
# Exonic vs. intronic length per transcript (both axes log10), coloured by
# gene GC percentage, one panel per selected transcript biotype.
# NOTE(review): midpoint = 10 puts the yellow centre of the colour scale at
# perc_gene_gc == 10 -- confirm this matches the range of the GC values.
tx_gene_meta %>%
  dplyr::filter(tx_biotype %in% tx_biotype_targets) %>%
  ggplot() +
  geom_point(
    aes(exonic_len, intronic_len, colour = perc_gene_gc),
    size = 0.3,
    alpha = 0.6
  ) +
  scale_x_log10() +
  scale_y_log10() +
  facet_wrap(~tx_biotype) +
  scale_colour_gradient2(midpoint = 10, mid = "yellow")
# Exon count vs. intron count per transcript (both axes log10), coloured by
# gene length, one panel per selected transcript biotype.
tx_gene_meta %>%
  dplyr::filter(tx_biotype %in% tx_biotype_targets) %>%
  ggplot() +
  geom_point(
    aes(nexon, nintron, colour = gene_len),
    size = 0.8,
    alpha = 0.7
  ) +
  scale_x_log10() +
  scale_y_log10() +
  facet_wrap(~tx_biotype)
# Pre-mRNA length vs. exonic length (both axes log10) for transcripts of the
# selected biotypes, excluding transcripts with exactly one exon. Points are
# coloured by exon count and overlaid with 2D density contours.
tx_gene_meta %>%
  dplyr::filter(tx_biotype %in% tx_biotype_targets & nexon != 1) %>%
  ggplot(aes(premrna_len, exonic_len, colour = nexon)) +
  geom_point(size = 0.1, alpha = 0.3) +
  geom_density_2d() +
  scale_x_log10() +
  scale_y_log10() +
  facet_wrap(~tx_biotype)
# Exon count vs. exonic length (both axes log10) for transcripts of the
# selected biotypes, coloured by pre-mRNA length.
tx_gene_meta %>%
  dplyr::filter(tx_biotype %in% tx_biotype_targets) %>%
  ggplot(aes(nexon, exonic_len, colour = premrna_len)) +
  geom_point(size = 0.1, alpha = 0.3) +
  scale_x_log10() +
  scale_y_log10() +
  facet_wrap(~tx_biotype)
# Number of transcripts (column n) per gene_id / gene_name / gene_biotype
# combination, ordered from most to fewest transcripts.
count_tx_by_gene <- tx_gene_meta %>%
  dplyr::count(gene_id, gene_name, gene_biotype) %>%
  dplyr::arrange(dplyr::desc(n))
count_tx_by_gene

# Distribution of the per-gene transcript counts.
count_tx_by_gene %>%
  ggplot(aes(n)) +
  geom_histogram(bins = 100)
# Names of the 20 genes at the top of count_tx_by_gene, i.e. those with the
# most transcripts.
top_genes <- head(count_tx_by_gene$gene_name, 20)
top_genes
[1] "Rian" "Tcf4" "Gm29154" "3110039I08Rik" "Ank3" "Smarca2" "Pcdh15" "Crem" "Kcnma1" "Hand2os1"
[11] "Sorbs2" "Wdr45" "Nedd4l" "Adgrl3" "Ttc3" "Hnrnpk" "Qars" "U2af1l4" "Anks1b" "Arhgef2"
# Exonic vs. intronic length for every transcript of the 20 genes with the
# most transcripts, one free-scaled panel per gene; colour marks transcripts
# with at least 10 exons.
# Transcripts are matched on gene_id rather than gene_name: gene names are
# not unique in this annotation, so filtering by name could pull in
# transcripts of a different gene that merely shares a name with a top gene.
tx_gene_meta %>%
  dplyr::semi_join(head(count_tx_by_gene, 20), by = "gene_id") %>%
  ggplot() +
  geom_point(aes(exonic_len, intronic_len, colour = nexon >= 10)) +
  facet_wrap(gene_biotype ~ gene_name, scales = "free")
# Names of the first 20 genes (in count_tx_by_gene order) that have exactly
# 10 transcripts.
tx_10_genes <- count_tx_by_gene %>%
  dplyr::filter(n == 10) %>%
  head(20) %>%
  dplyr::pull(gene_name)

# Exonic vs. intronic length for the transcripts of those genes, one
# free-scaled panel per gene name; colour marks transcripts with at least
# 10 exons.
# Transcripts are matched on gene_id rather than gene_name: gene names are
# not unique in this annotation, so filtering by name could add transcripts
# of an unrelated gene that shares a name with one of the selected genes.
tx_gene_meta %>%
  dplyr::semi_join(
    count_tx_by_gene %>%
      dplyr::filter(n == 10) %>%
      head(20),
    by = "gene_id"
  ) %>%
  ggplot() +
  geom_point(aes(exonic_len, intronic_len, colour = nexon >= 10)) +
  facet_wrap(~gene_name, scales = "free")
NA
# Names of the genes that have exactly one transcript.
# NOTE(review): gene names are not unique in this table, so this vector of
# names does not identify genes unambiguously; gene_id does.
single_transcript_genes <- with(count_tx_by_gene, gene_name[n == 1])
length(single_transcript_genes)
[1] 34389
# Exonic vs. intronic length (both axes log10) for the transcripts of
# single-transcript genes in the selected gene biotypes, coloured by gene GC
# percentage, one panel per transcript biotype.
# Genes are matched on gene_id rather than gene_name: gene names are not
# unique in this annotation, so filtering by name would also plot the
# transcripts of a multi-transcript gene that shares its name with a
# single-transcript gene.
# NOTE(review): midpoint = 10 puts the yellow centre of the colour scale at
# perc_gene_gc == 10 -- confirm this matches the range of the GC values.
tx_gene_meta %>%
  dplyr::semi_join(dplyr::filter(count_tx_by_gene, n == 1), by = "gene_id") %>%
  dplyr::filter(gene_biotype %in% biotype_targets) %>%
  ggplot() +
  geom_point(
    aes(exonic_len, intronic_len, colour = perc_gene_gc),
    size = 0.2,
    alpha = 0.5
  ) +
  scale_x_log10() +
  scale_y_log10() +
  facet_wrap(~tx_biotype) +
  scale_colour_gradient2(midpoint = 10, mid = "yellow")
# Number of transcripts for each exon count, most common exon count first.
dplyr::count(tx_gene_meta, nexon) %>%
  dplyr::arrange(dplyr::desc(n))
# Exonic vs. intronic length (both axes log10) for transcripts with exactly
# two exons in the selected transcript biotypes, coloured by gene GC
# percentage.
# NOTE(review): midpoint = 10 puts the yellow centre of the colour scale at
# perc_gene_gc == 10 -- confirm this matches the range of the GC values.
tx_gene_meta %>%
  dplyr::filter(tx_biotype %in% tx_biotype_targets, nexon == 2) %>%
  ggplot() +
  geom_point(
    aes(exonic_len, intronic_len, colour = perc_gene_gc),
    size = 0.2,
    alpha = 0.5
  ) +
  scale_x_log10() +
  scale_y_log10() +
  facet_wrap(~tx_biotype) +
  scale_colour_gradient2(midpoint = 10, mid = "yellow")
# Exonic vs. intronic length (both axes log10) for transcripts with 2 to 9
# exons (nexon > 1 and nexon < 10) in the selected transcript biotypes,
# coloured by gene GC percentage.
# NOTE(review): midpoint = 10 puts the yellow centre of the colour scale at
# perc_gene_gc == 10 -- confirm this matches the range of the GC values.
tx_gene_meta %>%
  dplyr::filter(tx_biotype %in% tx_biotype_targets, nexon > 1, nexon < 10) %>%
  ggplot() +
  geom_point(
    aes(exonic_len, intronic_len, colour = perc_gene_gc),
    size = 0.2,
    alpha = 0.5
  ) +
  scale_x_log10() +
  scale_y_log10() +
  facet_wrap(~tx_biotype) +
  scale_colour_gradient2(midpoint = 10, mid = "yellow")
# Exonic vs. intronic length (both axes log10) for transcripts with at least
# 10 exons in the selected transcript biotypes, coloured by gene GC
# percentage.
# NOTE(review): midpoint = 10 puts the yellow centre of the colour scale at
# perc_gene_gc == 10 -- confirm this matches the range of the GC values.
tx_gene_meta %>%
  dplyr::filter(tx_biotype %in% tx_biotype_targets, nexon >= 10) %>%
  ggplot() +
  geom_point(
    aes(exonic_len, intronic_len, colour = perc_gene_gc),
    size = 0.2,
    alpha = 0.5
  ) +
  scale_x_log10() +
  scale_y_log10() +
  facet_wrap(~tx_biotype) +
  scale_colour_gradient2(midpoint = 10, mid = "yellow")
# Record the R version, platform and package versions used for this run.
utils::sessionInfo()
R version 3.6.1 (2019-07-05)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 18.04.3 LTS
Matrix products: default
BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8 LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
[7] LC_PAPER=en_US.UTF-8 LC_NAME=C LC_ADDRESS=C LC_TELEPHONE=C LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] forcats_0.4.0 stringr_1.4.0 dplyr_0.8.3 purrr_0.3.2 readr_1.3.1 tidyr_0.8.3 tibble_2.1.3 ggplot2_3.2.1 tidyverse_1.2.1
loaded via a namespace (and not attached):
[1] Rcpp_1.0.2 cellranger_1.1.0 pillar_1.4.2 compiler_3.6.1 tools_3.6.1 digest_0.6.20 zeallot_0.1.0 lubridate_1.7.4 jsonlite_1.6
[10] nlme_3.1-140 gtable_0.3.0 lattice_0.20-38 pkgconfig_2.0.2 rlang_0.4.0 cli_1.1.0 rstudioapi_0.10 haven_2.1.1 xfun_0.8
[19] withr_2.1.2 xml2_1.2.2 httr_1.4.1 knitr_1.24 generics_0.0.2 vctrs_0.2.0 hms_0.5.0 rprojroot_1.3-2 grid_3.6.1
[28] tidyselect_0.2.5 glue_1.3.1 R6_2.4.0 readxl_1.3.1 modelr_0.1.5 magrittr_1.5 MASS_7.3-51.4 backports_1.1.4 scales_1.0.0
[37] rvest_0.3.4 assertthat_0.2.1 colorspace_1.4-1 labeling_0.3 stringi_1.4.3 lazyeval_0.2.2 munsell_0.5.0 broom_0.5.2 crayon_1.3.4